Bring in the phyloseq object

#source("Scripts/MakePhyloseqObject.R")
load("IntermediateData/larval_phyloseq.RData")

Required libraries

library(phyloseq)
library(DESeq2)
library(dplyr)
library(ggplot2)
library(vegan)
library(ggrepel)
library(cowplot)
 library(lme4)
library(lmerTest)
 library(broom.mixed)
library(plotly)
library(tidyverse)

Better names for taxa

pull_lowlev <- function(taxaVec){
  taxaVec <- dplyr::select(Kingdom:Genus)
  taxaVec <- na.omit(taxaVec)
  taxaVec[length(taxaVec)]
}

theTaxa <- ps %>% tax_table() %>% as("matrix") %>% data.frame()  #%>% as.data.frame()
theTaxa[] <- lapply(theTaxa, as.character)
theTaxa$ASVNum <- rownames(theTaxa) %>% parse_number
theTaxa$ASVName <- rownames(theTaxa)


myLowist <- theTaxa %>% pivot_longer(Kingdom:Genus) %>% na.omit %>% group_by(ASVName) %>% summarise(Lowist = last(value))

theTaxa <- theTaxa %>% left_join(myLowist, by = "ASVName")
theTaxa <- theTaxa %>% mutate(JName = str_c(Lowist, ASVNum, sep = "_"))
theTaxa <- theTaxa %>% column_to_rownames("ASVName")
head(theTaxa)


tt2 <- tax_table(theTaxa)
Coercing from data.frame class to character matrix 
prior to building taxonomyTable. 
This could introduce artifacts. 
Check your taxonomyTable, or coerce to matrix manually.
rownames(tt2) <- rownames(theTaxa)
colnames(tt2) <- colnames(theTaxa)
ps_retax <- ps
tax_table(ps_retax) <- tt2

Remove blanks

ps_noblank <- subset_samples(ps_retax, Strain != "Blank")
ps_plusone <- transform_sample_counts(ps_noblank, function(x) x + 1)

Convert to relative abundance

psra <- ps %>% transform_sample_counts( function(x) x/sum(x))

Normalize microibal counts to oyster counts

# ps_oyster <- ps %>% subset_taxa(Order == "Ostreoida")
# ps_not_oyster <- ps %>% subset_taxa(Order != "Ostreoida")
# oyster_sums <- otu_table(ps_oyster)@.Data %>% apply(MARGIN = 2, sum)
# not_oyster_counts <- otu_table(ps_not_oyster)@.Data
# 
# over_oyster <- sweep(not_oyster_counts, 2, oyster_sums, "/")
# over_oyster_log10 <- log10(over_oyster)

This is a variance normalizing tranformaiton. Its apparently an alternative to rarifying the data.

deseq_pre <- phyloseq_to_deseq2(ps, design = ~ Project)
converting counts to integer mode
# deseq_counts <- estimateSizeFactors(deseq_pre, type = "poscounts")
# deseq_counts_vst <- varianceStabilizingTransformation(deseq_counts)
# vst_trans_count_tab <- assay(deseq_counts_vst)

# Ella on Slack
#deseq_pre
dds = deseq_pre[rowSums(counts(deseq_pre)) > 5,]
dds_esf <- estimateSizeFactors(dds, type = "poscounts")
dds01 <- DESeq(dds_esf)
using pre-existing size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
-- replacing outliers and refitting for 851 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing
dds_res <- results(dds01)

dds_counts <- counts(dds01, normalized = TRUE)

ps_dds <- ps_retax
otu_table(ps_dds) <- otu_table(dds_counts, taxa_are_rows = TRUE)

Select only species that show up in at least 20% of the samples.

ps_common <- filter_taxa(ps_dds, function(x) sum(x > 2) > (0.2*length(x)), TRUE)
ps_common
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 463 taxa and 35 samples ]
sample_data() Sample Data:       [ 35 samples by 6 sample variables ]
tax_table()   Taxonomy Table:    [ 463 taxa by 9 taxonomic ranks ]

Devide the abundance of each microbial ASV by the host gene copy number. Now microbial genes are reported as ratios to host, rather than counts. This process adjusts for compositionality.

Normalize microibal counts to oyster counts

ps_oyster <- ps_dds %>% subset_taxa(Order == "Ostreoida")
ps_not_oyster <- ps_common %>% subset_taxa(Order != "Ostreoida") # ok if not common, so just using dds
oyster_sums <- otu_table(ps_oyster)@.Data %>% apply(MARGIN = 2, sum)
not_oyster_counts <- otu_table(ps_not_oyster)@.Data

over_oyster <- sweep(not_oyster_counts, 2, oyster_sums, "/")
# so this suddently contains zeros, making everything fail, but it didn't used to. What gives?
over_oyster_log10 <- log10(over_oyster)
(detection_thresh <- min(na.omit(over_oyster[over_oyster > 0])))
[1] 7.091444e-06
over_oyster_log10 <- log10(over_oyster + detection_thresh)

ps_oo <- ps_not_oyster
otu_table(ps_oo) <- otu_table(over_oyster, taxa_are_rows = TRUE)
ps_oo <- subset_samples(ps_oo, (SampleID %in% c(names(oyster_sums[oyster_sums > 0])))) # remove cases

ps_oo_log <- ps_not_oyster
otu_table(ps_oo_log) <- otu_table(over_oyster_log10, taxa_are_rows = TRUE)

ps_oo_log_ss <- subset_samples(ps_oo_log, !(Project =="Mock"& Strain == "Even" & Run == "NoCrash"))
ps_oo_ss <- subset_samples(ps_oo, !(Project =="Mock"& Strain == "Even" & Run == "NoCrash"))

Make the one figure for the paper

A function I use

converts phyloseq sample data to a data frame

samd_to_df <- function(samd){
  df <- samd %>% sample_data %>% .@.Data %>% lapply(as.character) %>% data.frame
  colnames(df) <- samd@names
  rownames(df) <- samd@row.names
  df
}

Actually run the RDA

Redundancy analysis (RDA) is an ordination approach that allows us to look at how the microbial community as a whole varies across our variables of interest.


ps_oo_log_ss1 <- ps_oo_log_ss %>% subset_samples(((Run =="NoCrash" & Project == "NoCrash") | (Run == "Crash4" & Project == "Crash")) & Treatment %in% c("Fed", "Starve", "Pre"))

test_rda <- rda(t(otu_table(ps_oo_log_ss1)) ~ Project + as.factor(Strain == "Wild") + as.factor(Treatment == "Fed"), data = samd_to_df(sample_data(ps_oo_log_ss1)))
test_rda
Call: rda(formula = t(otu_table(ps_oo_log_ss1)) ~ Project +
as.factor(Strain == "Wild") + as.factor(Treatment == "Fed"), data =
samd_to_df(sample_data(ps_oo_log_ss1)))

               Inertia Proportion Rank
Total         398.6378     1.0000     
Constrained   219.1358     0.5497    3
Unconstrained 179.5020     0.4503   24
Inertia is variance 

Eigenvalues for constrained axes:
  RDA1   RDA2   RDA3 
168.98  41.07   9.09 

Eigenvalues for unconstrained axes:
  PC1   PC2   PC3   PC4   PC5   PC6   PC7   PC8 
45.55 28.02 17.16 13.76  8.03  6.92  5.75  5.37 
(Showing 8 of 24 unconstrained eigenvalues)
test_rda_anova <- anova(test_rda, by = "margin", permutations = how(nperm = 99999))
test_rda_anova
Permutation test for rda under reduced model
Marginal effects of terms
Permutation: free
Number of permutations: 99999

Model: rda(formula = t(otu_table(ps_oo_log_ss1)) ~ Project + as.factor(Strain == "Wild") + as.factor(Treatment == "Fed"), data = samd_to_df(sample_data(ps_oo_log_ss1)))
                              Df Variance       F  Pr(>F)    
Project                        1  154.281 20.6278   1e-05 ***
as.factor(Strain == "Wild")    1   18.224  2.4367 0.05050 .  
as.factor(Treatment == "Fed")  1   37.999  5.0806 0.00249 ** 
Residual                      24  179.502                    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
myScores <- scores(test_rda, choices = c(1:4), scaling = "symmetric")

Post procesing of RDA

Sample Data

mySamples <- left_join(
myScores$sites %>% data.frame %>% rownames_to_column("Sample"),
ps_oo_log_ss1 %>% sample_data() %>% samd_to_df %>% rownames_to_column("Sample"),
by = "Sample"
)

Percent variance explained

eigsum <- sum(c(test_rda$CCA$eig, test_rda$CA$eig))
cca_eig <- test_rda$CCA$eig / eigsum
ca_eig <- test_rda$CA$eig / eigsum
all_eig <- c(cca_eig, ca_eig)
all_eig["RDA1"] 
     RDA1 
0.4238992 
rda_eig_pct <- data.frame(all_eig) %>% rownames_to_column("Ax") %>% dplyr::rename(EigPct = "all_eig") %>%
  mutate(Axis2 = ordered(Ax, levels = Ax))
rda_eig_pct

Species

Select which species we will show in the figure, targeting ones that are far from 0, 0.

mySpecies <- left_join(
  myScores$species %>% data.frame %>% rownames_to_column("ASV"),
ps_oo_log_ss1 %>% tax_table() %>% .@.Data %>% as.data.frame %>% rownames_to_column("ASV")
) %>% mutate(RDADist = sqrt(RDA1^2 + RDA2^2 + RDA3 ^2)) %>% 
  arrange(-RDADist) %>%
  head(10) %>%
  mutate(Rank = 1:10)
Joining, by = "ASV"

Calculate Centroids

Centroids tell us about the different treatment types and how they relate to the samples and species.

myCent <- myScores$centroids %>% data.frame %>% rownames_to_column("Treatment") %>%
  tidyr::extract(Treatment, c("Type", "Condition"), "([A-Z][a-z]+)([A-Z].*)",  remove = FALSE) %>%
  mutate(Type = if_else(str_detect(Treatment, "Wild"), "Strain", Type)) %>%
  mutate(Condition = if_else(Type == "Strain" & str_detect(Treatment, "FALSE"), "Tame", Condition)) %>%
  mutate(Condition = if_else(Type == "Strain" & str_detect(Treatment, "TRUE"), "Wild", Condition)) %>%
  mutate(Type = if_else(str_detect(Treatment, "Fed"), "Feeding", Type)) %>%
  mutate(Condition = if_else(Type == "Feeding" & str_detect(Treatment, "FALSE"), "StarvedOrPre", Condition)) %>%
  mutate(Condition = if_else(Type == "Feeding" & str_detect(Treatment, "TRUE"), "Fed", Condition))

myCent2 <- myCent %>% filter((Condition %in% c("Fed", "Wild", "Crash")))

Main Figure

This actually plots the main figure, using ggplot

ccaPlot_1V2_A <- mySamples %>% ggplot(aes(x = RDA1, y = RDA2)) +
  geom_point(size = 3, stroke = 3, aes(shape = Project, color = Strain == "Wild", fill = Treatment), alpha = 1) +
  scale_shape_manual(values = c(21:25)) + scale_color_manual(values = c("gray40", "black")) + scale_fill_manual(values = c(Fed = "Blue", Pre = "DarkGreen", Starve = "Orange", Crash = "Pink", NoCrash = "White", Wild = "White"))+
  geom_point(data = mySpecies, size = 4, shape = "+") + 
  ggrepel::geom_text_repel(data = mySpecies, aes(label = JName) , size = 3) +
  guides(fill = guide_legend(override.aes = list(shape = 21)), color = guide_legend(override.aes = list(shape = 21))) +
  theme(legend.position = "bottom")

ccaLegend <- get_legend(ccaPlot_1V2_A)

ccaPlot_1V2_B <- ccaPlot_1V2_A +
  geom_label(data = myCent2, aes(label = Condition, x = RDA1 * 1.75, y = RDA2 * 2, fill = Condition) , size = 5) +
  geom_segment(data = myCent2, aes(x = 0, y = 0, xend = RDA1 * 2.5, yend = RDA2 * 2.5), arrow = arrow(length = unit(0.1, "in")), alpha = 0.5, size = 1) +
  coord_fixed(sqrt(test_rda$CCA$eig[2]/test_rda$CCA$eig[1])) +
  labs(x = paste0("RDA1", " (", scales::percent(all_eig["RDA1"]), ")"),
       y = paste0("RDA2", " (", scales::percent(all_eig["RDA2"]), ")")
       ) +
  cowplot::theme_cowplot() + theme(legend.position = "none")

#plot_grid(ccaPlot_1V2_B, ccaLegend)
ProtoNewFig <- plot_grid(ccaPlot_1V2_B, ccaLegend, nrow = 2, rel_heights = c(10,1))
ggsave("Figures/ProtoNewFig.svg", ProtoNewFig, width = 8, height = 6)

View Main Figure

The main figure is an .svg file, but here is a draft view for this workbook.

ProtoNewFig

I’m not sure why “NoCrash” and “Wild” showed up in the legend. It didn’t used to do that, but I’m not going to bother to correct this right now.

Seeing which species relate to crash vs non-crash

We don’t show these figures in the paper, but we do refer to them.

Initial data wrangling

ps_oo_log_ss2 <- ps_oo_log_ss1
sample_data(ps_oo_log_ss2) <-  mySamples %>% column_to_rownames("Sample") %>% sample_data()

Reshaping to long takes a little while. (~ 20 seconds)

melt_oo_log_ss2 <- psmelt(ps_oo_log_ss2)
melt2_oo_log_ss2 <- melt_oo_log_ss2 %>%
  mutate(logAbundance = Abundance) %>%
  mutate(Abundance = 10^(Abundance))
melt2_oo_log_ss2 <- melt2_oo_log_ss2 %>% left_join(mySpecies %>% select(RDA1.Spec = RDA1, JName), by = "JName")
melt2_oo_log_ss2 %>% head

Stuff

Run an lme to see if each microbe is related to project, holding out treatment as a mixed effect.

modframe <- melt2_oo_log_ss2 %>% select(Project:Group, logAbundance, Kingdom:Genus, JName ) %>% group_by(JName) %>% nest(data = Project:logAbundance) %>%
  #mutate(mod = map(data, ~tidy(lm(data = ., logAbundance ~ Project)))) %>%
  mutate(lme = map(data, ~tidy(lmer(data = ., logAbundance ~ Project + (1|Treatment) + (1|Strain)))))
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular
boundary (singular) fit: see ?isSingular

Ignore the many warning messages, and don’t ask me what they mean.

And visualize the results

modframe01 <- modframe %>% unnest(lme) %>% filter(term == "ProjectNoCrash") %>% select(Kingdom:JName, estimate, std.error, p.value) %>% mutate(fdr = p.adjust(p.value, method = "BH"))
ggplotly(
ggplot(modframe01, aes(x = estimate, y = log10(p.value),  color = Kingdom, JName = JName)) + geom_point() + 
  scale_color_manual(values = c(Bacteria = "Gray10", Eukaryota = "blue", Archaea = "red")) +
  geom_hline(aes(yintercept = log10(0.01)))
)

Figure 2. Not shown int the paper. Estimate is the size of the coefficint in a linear model. log10 P value tells about significance.

Everything below the line is statistically significant. Mouse over the dots to see which bacteria are which. If plotly is giving you problems, comment out the ggplotly bits and this shows up as a normal plot. But then you can’t mouse over poitns.

How many significant and non significant ASVs are there?

modframe01 %>% ungroup %>% summarise(signif = sum(p.value < 0.01), total = length(p.value)) %>% mutate(frachits = signif/total) 

63 % of the asvs are related to treatment p < 0.01.

---
title: "R Notebook"
output: html_notebook
---

# Bring in the phyloseq object

```{r}
#source("Scripts/MakePhyloseqObject.R")
load("IntermediateData/larval_phyloseq.RData")
```

# Required libraries

```{r}
library(phyloseq)
library(DESeq2)
library(dplyr)
library(ggplot2)
library(vegan)
library(ggrepel)
library(cowplot)
 library(lme4)
library(lmerTest)
 library(broom.mixed)
library(plotly)
library(tidyverse)

```

# Better names for taxa



```{r}
pull_lowlev <- function(taxaVec){
  taxaVec <- dplyr::select(Kingdom:Genus)
  taxaVec <- na.omit(taxaVec)
  taxaVec[length(taxaVec)]
}

theTaxa <- ps %>% tax_table() %>% as("matrix") %>% data.frame()  #%>% as.data.frame()
theTaxa[] <- lapply(theTaxa, as.character)
theTaxa$ASVNum <- rownames(theTaxa) %>% parse_number
theTaxa$ASVName <- rownames(theTaxa)


myLowist <- theTaxa %>% pivot_longer(Kingdom:Genus) %>% na.omit %>% group_by(ASVName) %>% summarise(Lowist = last(value))

theTaxa <- theTaxa %>% left_join(myLowist, by = "ASVName")
theTaxa <- theTaxa %>% mutate(JName = str_c(Lowist, ASVNum, sep = "_"))
theTaxa <- theTaxa %>% column_to_rownames("ASVName")
head(theTaxa)


tt2 <- tax_table(theTaxa)
rownames(tt2) <- rownames(theTaxa)
colnames(tt2) <- colnames(theTaxa)
ps_retax <- ps
tax_table(ps_retax) <- tt2
```

Remove blanks
```{r}
ps_noblank <- subset_samples(ps_retax, Strain != "Blank")
ps_plusone <- transform_sample_counts(ps_noblank, function(x) x + 1)
```

Convert to relative abundance
```{r}
psra <- ps %>% transform_sample_counts( function(x) x/sum(x))
```

Normalize microibal counts to oyster counts
```{r}
# ps_oyster <- ps %>% subset_taxa(Order == "Ostreoida")
# ps_not_oyster <- ps %>% subset_taxa(Order != "Ostreoida")
# oyster_sums <- otu_table(ps_oyster)@.Data %>% apply(MARGIN = 2, sum)
# not_oyster_counts <- otu_table(ps_not_oyster)@.Data
# 
# over_oyster <- sweep(not_oyster_counts, 2, oyster_sums, "/")
# over_oyster_log10 <- log10(over_oyster)
```

This is a variance normalizing tranformaiton. Its apparently an alternative to rarifying the data. 
```{r}
deseq_pre <- phyloseq_to_deseq2(ps, design = ~ Project)
# deseq_counts <- estimateSizeFactors(deseq_pre, type = "poscounts")
# deseq_counts_vst <- varianceStabilizingTransformation(deseq_counts)
# vst_trans_count_tab <- assay(deseq_counts_vst)

# Ella on Slack
#deseq_pre
dds = deseq_pre[rowSums(counts(deseq_pre)) > 5,]
dds_esf <- estimateSizeFactors(dds, type = "poscounts")
dds01 <- DESeq(dds_esf)
dds_res <- results(dds01)

dds_counts <- counts(dds01, normalized = TRUE)

ps_dds <- ps_retax
otu_table(ps_dds) <- otu_table(dds_counts, taxa_are_rows = TRUE)
```

Select only species that show up in at least 20% of the samples. 

```{r}
ps_common <- filter_taxa(ps_dds, function(x) sum(x > 2) > (0.2*length(x)), TRUE)
ps_common
```

Devide the abundance of each microbial ASV by the host gene copy number. Now microbial genes are reported as ratios to host, rather than counts.
This process adjusts for compositionality.

Normalize microibal counts to oyster counts

```{r}
ps_oyster <- ps_dds %>% subset_taxa(Order == "Ostreoida")
ps_not_oyster <- ps_common %>% subset_taxa(Order != "Ostreoida") # ok if not common, so just using dds
oyster_sums <- otu_table(ps_oyster)@.Data %>% apply(MARGIN = 2, sum)
not_oyster_counts <- otu_table(ps_not_oyster)@.Data

over_oyster <- sweep(not_oyster_counts, 2, oyster_sums, "/")
# so this suddently contains zeros, making everything fail, but it didn't used to. What gives?
over_oyster_log10 <- log10(over_oyster)
(detection_thresh <- min(na.omit(over_oyster[over_oyster > 0])))
over_oyster_log10 <- log10(over_oyster + detection_thresh)

ps_oo <- ps_not_oyster
otu_table(ps_oo) <- otu_table(over_oyster, taxa_are_rows = TRUE)
ps_oo <- subset_samples(ps_oo, (SampleID %in% c(names(oyster_sums[oyster_sums > 0])))) # remove cases

ps_oo_log <- ps_not_oyster
otu_table(ps_oo_log) <- otu_table(over_oyster_log10, taxa_are_rows = TRUE)

ps_oo_log_ss <- subset_samples(ps_oo_log, !(Project =="Mock"& Strain == "Even" & Run == "NoCrash"))
ps_oo_ss <- subset_samples(ps_oo, !(Project =="Mock"& Strain == "Even" & Run == "NoCrash"))
```

# Make the one figure for the paper

## A function I use
converts phyloseq sample data to a data frame
```{r}
samd_to_df <- function(samd){
  df <- samd %>% sample_data %>% .@.Data %>% lapply(as.character) %>% data.frame
  colnames(df) <- samd@names
  rownames(df) <- samd@row.names
  df
}
```

## Actually run the RDA
Redundancy analysis (RDA) is an ordination approach that allows us to look at how the microbial community as a whole varies across our variables of interest.
```{r}

ps_oo_log_ss1 <- ps_oo_log_ss %>% subset_samples(((Run =="NoCrash" & Project == "NoCrash") | (Run == "Crash4" & Project == "Crash")) & Treatment %in% c("Fed", "Starve", "Pre"))

test_rda <- rda(t(otu_table(ps_oo_log_ss1)) ~ Project + as.factor(Strain == "Wild") + as.factor(Treatment == "Fed"), data = samd_to_df(sample_data(ps_oo_log_ss1)))
test_rda
test_rda_anova <- anova(test_rda, by = "margin", permutations = how(nperm = 99999))
test_rda_anova
myScores <- scores(test_rda, choices = c(1:4), scaling = "symmetric")
```

## Post procesing of RDA

### Sample Data
```{r}
mySamples <- left_join(
myScores$sites %>% data.frame %>% rownames_to_column("Sample"),
ps_oo_log_ss1 %>% sample_data() %>% samd_to_df %>% rownames_to_column("Sample"),
by = "Sample"
)
```

### Percent variance explained
```{r}
eigsum <- sum(c(test_rda$CCA$eig, test_rda$CA$eig))
cca_eig <- test_rda$CCA$eig / eigsum
ca_eig <- test_rda$CA$eig / eigsum
all_eig <- c(cca_eig, ca_eig)
all_eig["RDA1"] 
```

```{r}
rda_eig_pct <- data.frame(all_eig) %>% rownames_to_column("Ax") %>% dplyr::rename(EigPct = "all_eig") %>%
  mutate(Axis2 = ordered(Ax, levels = Ax))
rda_eig_pct
```

### Species
Select which species we will show in the figure, targeting ones that are far from 0, 0.
```{r}
mySpecies <- left_join(
  myScores$species %>% data.frame %>% rownames_to_column("ASV"),
ps_oo_log_ss1 %>% tax_table() %>% .@.Data %>% as.data.frame %>% rownames_to_column("ASV")
) %>% mutate(RDADist = sqrt(RDA1^2 + RDA2^2 + RDA3 ^2)) %>% 
  arrange(-RDADist) %>%
  head(10) %>%
  mutate(Rank = 1:10)
```

## Calculate Centroids
Centroids tell us about the different treatment types and how they relate to the samples and species.
```{r}
myCent <- myScores$centroids %>% data.frame %>% rownames_to_column("Treatment") %>%
  tidyr::extract(Treatment, c("Type", "Condition"), "([A-Z][a-z]+)([A-Z].*)",  remove = FALSE) %>%
  mutate(Type = if_else(str_detect(Treatment, "Wild"), "Strain", Type)) %>%
  mutate(Condition = if_else(Type == "Strain" & str_detect(Treatment, "FALSE"), "Tame", Condition)) %>%
  mutate(Condition = if_else(Type == "Strain" & str_detect(Treatment, "TRUE"), "Wild", Condition)) %>%
  mutate(Type = if_else(str_detect(Treatment, "Fed"), "Feeding", Type)) %>%
  mutate(Condition = if_else(Type == "Feeding" & str_detect(Treatment, "FALSE"), "StarvedOrPre", Condition)) %>%
  mutate(Condition = if_else(Type == "Feeding" & str_detect(Treatment, "TRUE"), "Fed", Condition))

myCent2 <- myCent %>% filter((Condition %in% c("Fed", "Wild", "Crash")))
```

### Main Figure
This actually plots the main figure, using ggplot
```{r}
ccaPlot_1V2_A <- mySamples %>% ggplot(aes(x = RDA1, y = RDA2)) +
  geom_point(size = 3, stroke = 3, aes(shape = Project, color = Strain == "Wild", fill = Treatment), alpha = 1) +
  scale_shape_manual(values = c(21:25)) + scale_color_manual(values = c("gray40", "black")) + scale_fill_manual(values = c(Fed = "Blue", Pre = "DarkGreen", Starve = "Orange", Crash = "Pink", NoCrash = "White", Wild = "White"))+
  geom_point(data = mySpecies, size = 4, shape = "+") + 
  ggrepel::geom_text_repel(data = mySpecies, aes(label = JName) , size = 3) +
  guides(fill = guide_legend(override.aes = list(shape = 21)), color = guide_legend(override.aes = list(shape = 21))) +
  theme(legend.position = "bottom")

ccaLegend <- get_legend(ccaPlot_1V2_A)

ccaPlot_1V2_B <- ccaPlot_1V2_A +
  geom_label(data = myCent2, aes(label = Condition, x = RDA1 * 1.75, y = RDA2 * 2, fill = Condition) , size = 5) +
  geom_segment(data = myCent2, aes(x = 0, y = 0, xend = RDA1 * 2.5, yend = RDA2 * 2.5), arrow = arrow(length = unit(0.1, "in")), alpha = 0.5, size = 1) +
  coord_fixed(sqrt(test_rda$CCA$eig[2]/test_rda$CCA$eig[1])) +
  labs(x = paste0("RDA1", " (", scales::percent(all_eig["RDA1"]), ")"),
       y = paste0("RDA2", " (", scales::percent(all_eig["RDA2"]), ")")
       ) +
  cowplot::theme_cowplot() + theme(legend.position = "none")

#plot_grid(ccaPlot_1V2_B, ccaLegend)
ProtoNewFig <- plot_grid(ccaPlot_1V2_B, ccaLegend, nrow = 2, rel_heights = c(10,1))
ggsave("Figures/ProtoNewFig.svg", ProtoNewFig, width = 8, height = 6)
```

### View Main Figure

The main figure is an .svg file, but here is a draft view for this workbook.
```{r}
ProtoNewFig
```

I'm not sure why "NoCrash" and "Wild" showed up in the legend. It didn't used to do that, but I'm not going to bother to correct this right now.

# Seeing which species relate to crash vs non-crash
We don't show these figures in the paper, but we do refer to them.

## Initial data wrangling

```{r}
ps_oo_log_ss2 <- ps_oo_log_ss1
sample_data(ps_oo_log_ss2) <-  mySamples %>% column_to_rownames("Sample") %>% sample_data()
```

Reshaping to long takes a little while. (~ 20 seconds)
```{r}
melt_oo_log_ss2 <- psmelt(ps_oo_log_ss2)
```

```{r}
melt2_oo_log_ss2 <- melt_oo_log_ss2 %>%
  mutate(logAbundance = Abundance) %>%
  mutate(Abundance = 10^(Abundance))
melt2_oo_log_ss2 <- melt2_oo_log_ss2 %>% left_join(mySpecies %>% select(RDA1.Spec = RDA1, JName), by = "JName")
melt2_oo_log_ss2 %>% head
```

## Stuff

Run an lme to see if each microbe is related to project, holding out treatment as a mixed effect.

```{r}
modframe <- melt2_oo_log_ss2 %>% select(Project:Group, logAbundance, Kingdom:Genus, JName ) %>% group_by(JName) %>% nest(data = Project:logAbundance) %>%
  #mutate(mod = map(data, ~tidy(lm(data = ., logAbundance ~ Project)))) %>%
  mutate(lme = map(data, ~tidy(lmer(data = ., logAbundance ~ Project + (1|Treatment) + (1|Strain)))))
```
Ignore the many warning messages, and don't ask me what they mean.

And visualize the results
```{r}
modframe01 <- modframe %>% unnest(lme) %>% filter(term == "ProjectNoCrash") %>% select(Kingdom:JName, estimate, std.error, p.value) %>% mutate(fdr = p.adjust(p.value, method = "BH"))
```

```{r}
ggplotly(
ggplot(modframe01, aes(x = estimate, y = log10(p.value),  color = Kingdom, JName = JName)) + geom_point() + 
  scale_color_manual(values = c(Bacteria = "Gray10", Eukaryota = "blue", Archaea = "red")) +
  geom_hline(aes(yintercept = log10(0.01)))
)
```

Figure 2. Not shown int the paper. Estimate is the size of the coefficint in a linear model. log10 P value tells about significance. 

Everything below the line is statistically significant. Mouse over the dots to see which bacteria are which. If plotly is giving you problems, comment out the `ggplotly` bits and this shows up as a normal plot. But then you can't mouse over poitns.

How many significant and non significant ASVs are there?

```{r}
modframe01 %>% ungroup %>% summarise(signif = sum(p.value < 0.01), total = length(p.value)) %>% mutate(frachits = signif/total) 
```
63 % of the asvs are related to treatment p < 0.01.